---
title: "Prophage sequence similarity using ANI and MASH"
author: "Lakhansing Pardeshi"
date: "`r Sys.Date()`"
format:
html:
embed-resources: true
df-print: paged
knitr:
opts_chunk:
fig.height: 7
---
This script process ANI and MASH output for the filtered prophages and stores the
distance matrices and clustering dendrograms.
## Initial setup
```{r}
#| label: setup
#| warning: false
suppressPackageStartupMessages (library (tidyverse))
suppressPackageStartupMessages (library (here))
suppressPackageStartupMessages (library (ape))
suppressPackageStartupMessages (library (treeio))
suppressPackageStartupMessages (library (ggtree))
suppressPackageStartupMessages (library (magrittr))
suppressPackageStartupMessages (library (corrplot))
# summarize prophage ANI and MASH results
# cluster prophages based on ANI distance
rm (list = ls ())
source ("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R" )
source ("scripts/utils/config_functions.R" )
source ("scripts/utils/heatmap_utils.R" )
################################################################################
set.seed (124 )
confs <- prefix_config_paths (
conf = suppressWarnings (configr:: read.config (file = "project_config.yaml" )),
dir = "."
)
pangenome <- confs$ data$ pangenomes$ pectobacterium.v2$ name
panConf <- confs$ data$ pangenomes[[pangenome]]
outDir <- confs$ analysis$ prophages$ dir
```
## Process ANI and MASH output and hierarchical clustering of the data
```{r}
sampleInfo <- get_metadata (file = panConf$ files$ metadata, genus = confs$ genus)
sampleInfoList <- as.list_metadata (
df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)
phages <- suppressMessages (
readr:: read_tsv (confs$ analysis$ prophages$ preprocessing$ files$ filtered)
) %>%
dplyr:: select (
prophage_id, fragments, nFragments, prophage_length, nHg, genomeId
)
nodeOfInterest <- dplyr:: select (phages, prophage_id)
```
### process ANI data for phages and store ANI and ANI-distance matrices
```{r}
aniDf <- suppressMessages (readr:: read_tsv (
file = confs$ data$ prophages$ files$ ani,
col_names = c ("id1" , "id2" , "ani" , "mapped" , "total" )
)) %>%
dplyr:: mutate (
dplyr:: across (
.cols = c (id1, id2),
.fns = ~ stringr:: str_replace (string = .x, pattern = ".*/(.*).fna" , replacement = " \\ 1" )
),
dist = 1 - (ani / 100 )
)
aniDist <- dplyr:: left_join (
x = nodeOfInterest,
y = tidyr:: pivot_wider (
data = aniDf,
id_cols = "id1" ,
names_from = "id2" ,
values_from = "dist" ,
values_fill = 1
),
by = c ("prophage_id" = "id1" )
) %>%
dplyr:: select (prophage_id, all_of (nodeOfInterest$ prophage_id))
readr:: write_tsv (
x = aniDist, file = confs$ analysis$ prophages$ preprocessing$ files$ ani_dist
)
aniDistMat <- tibble:: column_to_rownames (aniDist, var = "prophage_id" ) %>%
as.matrix () %>%
as.dist ()
## store UPGMA and NJ trees
# plot(hclust(distMat))
aniUpgma <- ape:: as.phylo (hclust (d = aniDistMat, method = "complete" )) %>%
ape:: ladderize () %>%
ape:: makeNodeLabel (method = "number" , prefix = "n" )
ape:: write.tree (
phy = aniUpgma, tree.names = "ani_hclust" ,
file = confs$ analysis$ prophages$ preprocessing$ files$ ani_hclust
)
# plot(ape::root(phy = treeUpgma, outgroup = sampleInfoList[[outGroup]]$Genome, edgelabel = TRUE))
# nodelabels()
aniNj <- ape:: nj (aniDistMat) %>%
ape:: ladderize () %>%
ape:: makeNodeLabel (method = "number" , prefix = "n" )
## set negative length edges => 0
aniNj$ edge.length[aniNj$ edge.length < 0 ] <- 0
ape:: write.tree (
phy = aniNj, tree.names = "ANI_NJ" ,
file = confs$ analysis$ prophages$ preprocessing$ files$ ani_nj
)
```
### process MASH data and generate trees
```{r}
mashDf <- suppressMessages (readr:: read_tsv (
file = confs$ data$ prophages$ files$ mash,
col_names = c ("id1" , "id2" , "dist" , "pvalue" , "mapped" )
)) %>%
dplyr:: mutate (
dplyr:: across (
.cols = c (id1, id2),
.fns = ~ stringr:: str_replace (
string = .x, pattern = ".*/(.*).fna" , replacement = " \\ 1"
)
)
)
# mashDf %>%
# dplyr::filter(id1 %in% !!nodeOfInterest$id, id2 %in% !!nodeOfInterest$id) %>%
# dplyr::filter(id1 != id2) %>%
# dplyr::arrange(dist, id1, id2) %>%
# dplyr::left_join(
# y = dplyr::select(prophageDf, id1 = prophage_id, species1 = SpeciesName),
# by = "id1"
# ) %>%
# dplyr::left_join(
# y = dplyr::select(prophageDf, id2 = prophage_id, species2 = SpeciesName),
# by = "id2"
# ) %>%
# clipr::write_clip()
mashDist <- dplyr:: left_join (
x = nodeOfInterest,
y = tidyr:: pivot_wider (
data = mashDf,
id_cols = "id1" ,
names_from = "id2" ,
values_from = "dist" ,
values_fill = 1
),
by = c ("prophage_id" = "id1" )
) %>%
dplyr:: select (prophage_id, all_of (nodeOfInterest$ prophage_id))
readr:: write_tsv (
x = mashDist, file = confs$ analysis$ prophages$ preprocessing$ files$ mash_dist
)
mashDistMat <- tibble:: column_to_rownames (mashDist, var = "prophage_id" ) %>%
as.matrix () %>%
as.dist ()
## store UPGMA and NJ trees
# plot(hclust(distMat))
mashUpgma <- ape:: as.phylo (hclust (d = mashDistMat, method = "complete" )) %>%
ape:: ladderize () %>%
ape:: makeNodeLabel (method = "number" , prefix = "n" )
ape:: write.tree (
phy = mashUpgma, tree.names = "mash_hclust" ,
file = confs$ analysis$ prophages$ preprocessing$ files$ mash_hclust
)
mashNj <- ape:: nj (mashDistMat) %>%
ape:: ladderize () %>%
ape:: makeNodeLabel (method = "number" , prefix = "n" )
## set negative length edges => 0
mashNj$ edge.length[mashNj$ edge.length < 0 ] <- 0
ape:: write.tree (
phy = mashNj, tree.names = "mash_nj" ,
file = confs$ analysis$ prophages$ preprocessing$ files$ mash_nj
)
```
## correlate MASH and ANI trees
```{r}
#| fig-height: 7
#| fig-width: 7
#| out-width: '100%'
## cophenetic distance calculation for NJ and UPGMA tree
copheneticDist <- tibble:: tibble (
aniDist = as.vector (aniDistMat),
aniUpgma = as.vector (as.dist (cophenetic (aniUpgma))),
aniNj = as.vector (as.dist (cophenetic (aniNj))),
mashDist = as.vector (mashDistMat),
mashUpgma = as.vector (as.dist (cophenetic (mashUpgma))),
mashNj = as.vector (as.dist (cophenetic (mashNj)))
)
M <- cor (copheneticDist)
corrplot:: corrplot (
M,
type = "lower" , tl.col = "black" ,
cl.ratio = 0.2 , tl.srt = 45 , col = COL2 ("BrBG" ), addCoef.col = "white"
)
```
## Visualize distance matrices with clustering
### Visualize MASH UPGMA tree
```{r}
treeTbl <- treeio:: as_tibble (mashUpgma) %>%
dplyr:: left_join (y = phages, by = c ("label" = "prophage_id" )) %>%
treeio:: as.treedata ()
pt_treeUpgma <- ggtree:: ggtree (
tr = treeTbl
) +
ggtree:: geom_nodelab (
mapping = aes (label = label),
node = "internal" , size = 3 , hjust = 1.3 , color = "red"
) +
ggtree:: geom_tiplab (
size = 2 , align = TRUE , linesize = 0.5
) +
scale_x_continuous (expand = expansion (mult = c (0.05 , 0.5 ))) +
ggnewscale:: new_scale_color ()
# ggsave(
# plot = pt_treeUpgma, width = 8, height = 20, scale = 2,
# filename = paste(outDir, "/prophages.MASH.UPGMA_tree.pdf", sep = "")
# )
```
:::{.scrolling_y}
```{r}
#| echo: false
#| column: page
#| fig-height: 24
#| fig-width: 10
#| out-width: '100%'
#| layout-valign: top
#|
pt_treeUpgma
```
:::
### Visualize ANI distances
```{r}
ht_ani <- plot_species_ANI_heatmap (
mat = 100 * (1 - as.matrix (aniDistMat)),
name = "ani" ,
phy = aniUpgma, speciesInfo = NULL ,
col = circlize:: colorRamp2 (
breaks = c (0 , 50 , 80 , 90 , 91 , 92 , 93 , 93.5 , 94 , 95 , 96 , 97 , 99 ),
colors = viridisLite:: viridis (n = 13 , option = "B" )
),
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
)
)
# pdf(
# file = file.path(outDir, "prophage_ANI_heatmap.pdf"),
# width = 10, height = 10
# )
# ComplexHeatmap::draw(
# object = ht_ani,
# main_heatmap = "ani",
# row_dend_side = "left",
# heatmap_legend_side = "bottom"
# )
# dev.off()
```
```{r}
#| echo: false
#| column: page
#| fig-height: 12
#| fig-width: 12
#| out-width: '100%'
#| layout-valign: top
ComplexHeatmap:: draw (
object = ht_ani,
main_heatmap = "ani" ,
row_dend_side = "left" ,
heatmap_legend_side = "bottom"
)
```
### Visualize MASH distances
```{r}
ht_mash <- plot_species_ANI_heatmap (
mat = as.matrix (mashDistMat),
phy = mashUpgma, speciesInfo = NULL ,
name = "mash" ,
col = circlize:: colorRamp2 (
breaks = c (0 , 0.05 , 0.1 , 0.15 , 0.2 , 0.3 , 0.4 , 0.5 , 0.6 , 0.7 , 0.8 , 0.9 , 1 ),
# breaks = seq(0, 1, length.out = 13),
colors = viridisLite:: viridis (n = 13 , option = "B" )
),
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
)
)
# pdf(
# file = file.path(outDir, "prophage_MASH_heatmap.pdf"),
# width = 10, height = 10
# )
# ComplexHeatmap::draw(
# object = ht_mash,
# main_heatmap = "mash",
# row_dend_side = "left",
# heatmap_legend_side = "bottom"
# )
# dev.off()
```
```{r}
#| echo: false
#| column: page
#| fig-height: 12
#| fig-width: 12
#| out-width: '100%'
#| layout-valign: top
ComplexHeatmap:: draw (
object = ht_mash,
main_heatmap = "mash" ,
row_dend_side = "left" ,
heatmap_legend_side = "bottom"
)
```